Neteja de dades

## 'data.frame':    2800 obs. of  29 variables:
##  $ Age               : num  41 23 46 70 70 18 59 80 66 68 ...
##  $ Sex               : num  0 0 1 0 0 0 0 0 0 1 ...
##  $ On Thyroxine      : num  0 0 0 1 0 1 0 0 0 0 ...
##  $ Query on Thyroxine: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Antithyroid Med   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sick              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Pregnant          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Thyroid Surgery   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ I131 treatment    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Query Hypothyroid : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Query Hyperthyroid: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Lithium           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Goitre            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Tumor             : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ Hypopituitary     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Psych             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ TSH measured      : num  1 1 1 1 1 1 0 1 1 1 ...
##  $ TSH               : num  1.3 4.1 0.98 0.16 0.72 0.03 NA 2.2 0.6 2.4 ...
##  $ T3 measured       : num  1 1 0 1 1 0 0 1 1 1 ...
##  $ T3                : num  2.5 2 NA 1.9 1.2 NA NA 0.6 2.2 1.6 ...
##  $ TT4 measured      : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ TT4               : num  125 102 109 175 61 183 72 80 123 83 ...
##  $ T4U measured      : num  1 0 1 0 1 1 1 1 1 1 ...
##  $ T4U               : num  1.14 NA 0.91 NA 0.87 1.3 0.92 0.7 0.93 0.89 ...
##  $ FTI measured      : num  1 0 1 0 1 1 1 1 1 1 ...
##  $ FTI               : num  109 NA 120 NA 70 141 78 115 132 93 ...
##  $ referral sourse   : chr  "SVHC" "other" "other" "other" ...
##  $ State             : chr  "negative" "negative" "negative" "negative" ...
##  $ ID                : num  3733 1442 2965 806 2807 ...

Classificació de variables per tipus

## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Comparatives en valors de laboratori

Eliminamos Hypopituitary porque solo tiene un valor distinto de 0.

# Condition columns whose group means are compared below.
Causes <- c("Pregnant", "Goitre", "Tumor", "Psych")

# Pass every condition column of `data` through fun_range() (defined
# outside this section) and store the result back in place.
for (i in Causes) {
  data[, i] <- fun_range(data[, i])
}

# NOTE(review): the means below are read from data_p / data_n, not from the
# `data` frame rescaled above -- confirm those subsets are the intended input.

# Mean of each condition column in data_p, ignoring NAs; kept unnamed so the
# resulting data frame gets default row names.
means_cp <- vapply(
  Causes,
  function(col) mean(data_p[[col]], na.rm = TRUE),
  numeric(1),
  USE.NAMES = FALSE
)

# Same per-column means for data_n.
means_cn <- vapply(
  Causes,
  function(col) mean(data_n[[col]], na.rm = TRUE),
  numeric(1),
  USE.NAMES = FALSE
)

# Labels follow the order of the values: the four data_p means ("+") first,
# then the four data_n means ("-").
names_means_c <- c(
  "+ Pregnant", "+ Goitre", "+ Tumor", "+ Psych",
  "- Pregnant", "- Goitre", "- Tumor", "- Psych"
)
values_means_c <- c(means_cp, means_cn)

# Two-column table (label, mean); column names come from the variable names.
means_c <- data.frame(names_means_c, values_means_c)

## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
## 
##     legend

Consideramos las 6 variables con más correlación con la variable objetivo State:

T4U(0.46), Pregnant(0.35), T3(0.34), TT4(0.26), Age(0.12), Sex(0.1)